package cx.threeg.dcms.core;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Lightweight, regex-based extractor that pulls hyperlinks (and a best-effort
 * thumbnail for each) out of an HTML string.
 *
 * <p>This is not a real HTML parser: only quoted {@code href}/{@code src}
 * attribute values are recognised and nested markup is handled by pattern
 * matching alone.
 */
public class HtmlLightParser {

    /** Matches a whole {@code <script>} element so it can be removed before scanning. */
    private static final Pattern SCRIPT_BLOCK = Pattern.compile("(?is)<script.*?</script>");

    /** Matches a whole {@code <style>} element so it can be removed before scanning. */
    private static final Pattern STYLE_BLOCK = Pattern.compile("(?is)<style.*?</style>");

    /** Captures the {@code content} of an {@code og:image} meta tag (property must precede content). */
    private static final Pattern OG_IMAGE = Pattern.compile(
            "(?is)<meta[^>]+property=['\"]og:image['\"][^>]+content=['\"]([^'\"]+)['\"]");

    /**
     * Captures (1) the quoted href and (2) the inner markup of an {@code <a>} element.
     * The tag name must be exactly {@code a} and {@code href} must be a standalone
     * attribute, so {@code <article ...>} or {@code data-href="..."} do not match.
     */
    private static final Pattern ANCHOR = Pattern.compile(
            "(?is)<a\\s+(?:[^>]*?\\s)?href\\s*=\\s*['\"]([^'\"]+)['\"][^>]*>(.*?)</a>");

    /** Captures the quoted {@code src} of an {@code <img>} tag. */
    private static final Pattern IMG_SRC = Pattern.compile("(?is)<img[^>]+src=['\"]([^'\"]+)['\"]");

    /** Matches any tag; DOTALL so that tags containing line breaks are stripped as well. */
    private static final Pattern ANY_TAG = Pattern.compile("(?s)<.*?>");

    /** Number of characters inspected on each side of a link start when looking for an image. */
    private static final int THUMB_SEARCH_SPAN = 300;

    /** One extracted hyperlink. Fields are public and mutable for direct access by callers. */
    public static class LinkItem {
        /** Link text with all markup removed and surrounding whitespace trimmed. */
        public String title;
        /** Link target, resolved against the base URL where it was root- or dot-relative. */
        public String href;
        /** Secondary display text; {@code extractLinks} sets this to the same value as {@code href}. */
        public String subtitle;
        /** Thumbnail URL, or {@code null} when none was found. */
        public String thumb;

        public LinkItem(String t, String h, String s) {
            title = t;
            href = h;
            subtitle = s;
        }

        public LinkItem(String t, String h, String s, String th) {
            title = t;
            href = h;
            subtitle = s;
            thumb = th;
        }
    }

    /**
     * Extracts every {@code <a href="...">} link with non-empty text from {@code html}.
     *
     * <p>For each link the thumbnail is the first {@code <img src>} found within
     * {@value #THUMB_SEARCH_SPAN} characters on either side of the link start, falling back
     * to the page's {@code og:image} meta tag, or {@code null} if neither exists.
     *
     * @param html     the markup to scan; {@code null} yields an empty list
     * @param baseUrl  prefix used to resolve URLs starting with {@code /} or {@code ./};
     *                 {@code null} is treated as an empty prefix
     * @param keywords optional case-insensitive filter on the link text; when {@code null}
     *                 or empty every link is kept. A non-empty array consisting solely of
     *                 null/blank entries matches nothing
     * @return the links in document order; never {@code null}
     */
    public static List<LinkItem> extractLinks(String html, String baseUrl, String[] keywords) {
        List<LinkItem> out = new ArrayList<LinkItem>();
        if (html == null) {
            return out;
        }

        // Drop script/style bodies first so their contents cannot be mistaken for markup.
        String body = STYLE_BLOCK.matcher(SCRIPT_BLOCK.matcher(html).replaceAll("")).replaceAll("");

        String globalThumb = resolveUrl(baseUrl, matchFirst(body, OG_IMAGE));
        boolean filterByKeyword = keywords != null && keywords.length > 0;

        Matcher anchors = ANCHOR.matcher(body);
        while (anchors.find()) {
            String title = ANY_TAG.matcher(anchors.group(2)).replaceAll("").trim();
            if (title.length() == 0) {
                continue;
            }
            if (filterByKeyword && !containsAnyKeyword(title, keywords)) {
                continue;
            }

            String href = resolveUrl(baseUrl, anchors.group(1));

            String nearby = sliceAround(body, anchors.start(), THUMB_SEARCH_SPAN);
            String thumb = resolveUrl(baseUrl, matchFirst(nearby, IMG_SRC));
            if (thumb == null) {
                thumb = globalThumb;
            }

            // The subtitle intentionally mirrors the resolved href.
            out.add(new LinkItem(title, href, href, thumb));
        }
        return out;
    }

    /**
     * Returns whether {@code text} contains at least one usable keyword, ignoring case.
     * Null and blank keywords are skipped. Lower-casing uses {@link Locale#ROOT} so the
     * result does not depend on the JVM default locale.
     */
    private static boolean containsAnyKeyword(String text, String[] keywords) {
        String haystack = text.toLowerCase(Locale.ROOT);
        for (int i = 0; i < keywords.length; i++) {
            String keyword = keywords[i];
            if (keyword == null) {
                continue;
            }
            keyword = keyword.trim();
            if (keyword.length() == 0) {
                continue;
            }
            if (haystack.contains(keyword.toLowerCase(Locale.ROOT))) {
                return true;
            }
        }
        return false;
    }

    /**
     * Resolves {@code url} against {@code baseUrl}.
     * <ul>
     *   <li>{@code //host/path} (protocol-relative) gets the scheme of {@code baseUrl} when it
     *       has one, otherwise it is returned unchanged;</li>
     *   <li>{@code /path} becomes {@code baseUrl + "/path"};</li>
     *   <li>{@code ./path} becomes {@code baseUrl + "/path"};</li>
     *   <li>anything else is returned unchanged.</li>
     * </ul>
     * A single trailing slash on {@code baseUrl} is dropped to avoid a doubled slash, and a
     * {@code null} base is treated as empty. Returns {@code null} when {@code url} is null.
     */
    private static String resolveUrl(String baseUrl, String url) {
        if (url == null) {
            return null;
        }
        String base = baseUrl == null ? "" : baseUrl;
        if (url.startsWith("//")) {
            int schemeEnd = base.indexOf("://");
            // Keep "scheme:" and append the network-path reference as is.
            return schemeEnd > 0 ? base.substring(0, schemeEnd + 1) + url : url;
        }
        if (base.endsWith("/")) {
            base = base.substring(0, base.length() - 1);
        }
        if (url.startsWith("/")) {
            return base + url;
        }
        if (url.startsWith("./")) {
            return base + url.substring(1);
        }
        return url;
    }

    /** Returns capture group 1 of the first match of {@code pattern} in {@code text}, or null. */
    private static String matchFirst(String text, Pattern pattern) {
        if (text == null) {
            return null;
        }
        Matcher m = pattern.matcher(text);
        return m.find() ? m.group(1) : null;
    }

    /** Returns the part of {@code text} within {@code span} characters of {@code center}, clamped to bounds. */
    private static String sliceAround(String text, int center, int span) {
        int s = Math.max(0, center - span);
        int e = Math.min(text.length(), center + span);
        return text.substring(s, e);
    }
}
